LASSO regression using thresholded NMF corr genes?
Pseudobulk Expression
pseudobulk_vst <- readRDS("../BALLbulk_Deconvolution/BALL_89pt_pseudobulk_vst.rds")
pseudobulk_vst
An object of class Seurat
29105 features across 89 samples within 1 assay
Active assay: RNA (29105 features, 0 variable features)
NMF_ptscores <- read_csv('../CompositionAnalysis/BALL_Composition_DevState_NMFscores.csv') %>%
left_join(read_csv('scBALL_IDconversion.csv')) %>%
select(ID_Bulk = ID, contains('NMF')) %>%
pivot_longer(-ID_Bulk, names_to = 'NMF', values_to = 'NMFscore') %>%
left_join(pseudobulk_vst@meta.data %>% select(ID_Bulk) %>% rownames_to_column('ID_scRNA') ) %>%
mutate(Lineage = NMF %>% str_replace('.*_',''), NMF = NMF %>% str_replace('_.*','')) %>%
select(Patient = ID_scRNA, NMF, Lineage, NMFscore)
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
TB = col_character(),
NMF1_ProB = col_double(),
NMF2_EarlyLymphoid = col_double(),
NMF3_Erythroid = col_double(),
NMF4_PreB = col_double(),
NMF5_MatureB = col_double(),
NMF6_MyeloidProg = col_double(),
NMF7_pDC = col_double(),
NMF8_HSCMPPLMPP = col_double(),
NMF9_Monocyte = col_double(),
NMF10_TNK = col_double()
)
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Patient = col_character(),
TB = col_character(),
ID = col_character(),
Sample_ID = col_character()
)
Joining with `by = join_by(TB)`Joining with `by = join_by(ID_Bulk)`
NMF_ptscores
NMFcorr <- read_csv('NMF_GeneCorr_Thresholding.csv') %>%
filter(threshold == 'pass', qvalue < 0.01) %>% arrange(qvalue) %>% arrange(NMF)
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
NMF = col_character(),
Gene = col_character(),
pearson = col_double(),
pvalue = col_double(),
qvalue = col_double(),
pos_K1_threshold = col_double(),
neg_K1_threshold = col_double(),
threshold = col_character()
)
NMFcorr
Define feature space - load markers
LinDE_FDR01_genes <- read_csv('BALL_DEresults_NMF_Lineage.csv') %>% filter(padj < 0.01, stat > 0) %>% pull(Gene) %>% unique()
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
baseMean = col_double(),
log2FoldChange = col_double(),
lfcSE = col_double(),
stat = col_double(),
pvalue = col_double(),
padj = col_double(),
Lineage = col_character()
)
LinDE_FDR05_genes <- read_csv('BALL_DEresults_NMF_Lineage.csv') %>% filter(padj < 0.05, stat > 0) %>% pull(Gene) %>% unique()
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
baseMean = col_double(),
log2FoldChange = col_double(),
lfcSE = col_double(),
stat = col_double(),
pvalue = col_double(),
padj = col_double(),
Lineage = col_character()
)
BDevDE_FDR01_genes <- read_csv('../BDevelopment_Characterization/BDevelopment_CellType_DEresults.csv') %>%
filter(padj < 0.01, stat > 0) %>% pull(Gene) %>% unique()
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
baseMean = col_double(),
log2FoldChange = col_double(),
lfcSE = col_double(),
stat = col_double(),
pvalue = col_double(),
padj = col_double(),
CellType = col_character()
)
BDevDE_FDR05_genes <- read_csv('../BDevelopment_Characterization/BDevelopment_CellType_DEresults.csv') %>%
filter(padj < 0.05, stat > 0) %>% pull(Gene) %>% unique()
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
baseMean = col_double(),
log2FoldChange = col_double(),
lfcSE = col_double(),
stat = col_double(),
pvalue = col_double(),
padj = col_double(),
CellType = col_character()
)
NMFcorr <- NMFcorr %>% mutate(LinDE_FDR05 = Gene %in% LinDE_FDR05_genes,
LinDE_FDR01 = Gene %in% LinDE_FDR01_genes,
BDevDE_FDR01 = Gene %in% BDevDE_FDR01_genes,
BDevDE_FDR05 = Gene %in% BDevDE_FDR05_genes)
NMFcorr
evaluate_model <- function(model, x_val, anno_val, lambda,
feature_name, iteration, foldname){
# Create score classification with survival and get covariates
pred_y <- predict(model, x_val, s = lambda) %>% data.frame()
colnames(pred_y) <- 'PredScore'
pred_y <- pred_y %>% rownames_to_column('Patient') %>%
# add anno to get covariates
left_join(anno_val, by = 'Patient')
# Calculate correlation in validation set
pearson <- cor(pred_y$PredScore, pred_y$NMFscore, method = 'pearson')
spearman <- cor(pred_y$PredScore, pred_y$NMFscore, method = 'spearman')
# Summary Metrics
summary_metrics <- data.frame(
'model_id' = paste0(feature_name, '_iter', iteration, '_', foldname),
'lambda' = lambda,
'model_size' = sum(coef(model, s = lambda)!=0),
'pearson' = pearson,
'spearman' = spearman,
'features' = feature_name,
'iteration' = iteration,
'foldname' = foldname
)
return(summary_metrics)
}
gridsearch_lasso <- function(expr_train, expr_val, anno_train, anno_val, features, feature_name,
iteration, foldname, summary_metrics){
# Filter expr matrix for feature set
x_train <- expr_train[, colnames(expr_train) %in% features]
x_val <- expr_val[, colnames(expr_val) %in% features]
# Train LASSO
model <- train_LASSO(x_train, anno_train)
# Get summary metrics for lambda.min and lambda.1se
for(lambda in c('lambda.min', 'lambda.1se')){
summary_metrics <- summary_metrics %>% rbind(
evaluate_model(model = model, x_val = x_val, anno_val = anno_val, lambda = lambda,
feature_name = feature_name, iteration = iteration, foldname = foldname))
}
return(summary_metrics)
}
nestedCV_regression <- function(train_anno, train_expr, iteration, feature_sets, summary_metrics){
# set up random seed and shuffle data
set.seed(iteration)
train_anno <- train_anno[sample(nrow(train_anno)),]
train_expr <- train_expr[sample(nrow(train_expr)),]
## 5-fold outer cross validation
folds <- rsample::vfold_cv(train_anno, 5)
for(outer_cv in 1:5){
# fold ID
foldname <- folds$id[[outer_cv]]
# get anno splits
anno_train <- analysis(folds$splits[[outer_cv]])
anno_val <- assessment(folds$splits[[outer_cv]])
# get expr splits
expr_train <- train_expr[anno_train$Patient,]
expr_val <- train_expr[anno_val$Patient,]
# Iterate through feature set and run gridsearch to train survival functions
for(feature_name in names(feature_sets)){
# get feature list
features <- feature_sets[[feature_name]]
# run gridsearch and get results
summary_metrics <- gridsearch_lasso(expr_train = expr_train, expr_val = expr_val, anno_train = anno_train, anno_val = anno_val,
features = features, feature_name = feature_name, iteration = iteration, foldname = foldname,
summary_metrics = summary_metrics)
}
}
return(summary_metrics)
}
library(tidymodels)
library(glmnet)
output <- data.frame()
train_x <- pseudobulk_vst@assays$RNA@data[,unique(NMF_ptscores$Patient)] %>% data.matrix() %>% t()
for(NMFcomp in c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10')){
print(paste0('NMF Component: ', NMFcomp))
temp_output <- data.frame()
train_y <- NMF_ptscores %>% filter(NMF == NMFcomp) %>% select(Patient, NMFscore)
featurespace <- list('PosCorr_LinDE_FDR05' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR05 == TRUE, pearson > 0) %>% pull(Gene),
'PosCorr_LinDE_FDR01' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR01 == TRUE, pearson > 0) %>% pull(Gene),
'PosCorr_LinDE_BDevDE_FDR05' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR05 == TRUE, BDevDE_FDR05 == TRUE, pearson > 0) %>% pull(Gene),
'PosCorr_LinDE_BDevDE_FDR01' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR01 == TRUE, BDevDE_FDR01 == TRUE, pearson > 0) %>% pull(Gene),
'AnyCorr_LinDE_FDR05' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR05 == TRUE) %>% pull(Gene),
'AnyCorr_LinDE_FDR01' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR01 == TRUE) %>% pull(Gene),
'AnyCorr_LinDE_BDevDE_FDR05' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR05 == TRUE, BDevDE_FDR05 == TRUE) %>% pull(Gene),
'AnyCorr_LinDE_BDevDE_FDR01' = NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR01 == TRUE, BDevDE_FDR01 == TRUE) %>% pull(Gene)
)
for(iteration in 1:10){
print(paste0('iteration ', iteration))
temp_output <- nestedCV_regression(train_anno = train_y, train_expr = train_x, iteration = iteration, feature_sets = featurespace,
summary_metrics = temp_output)
}
## annotate and add to final output
output <- bind_rows(output, temp_output %>% mutate(NMF = NMFcomp))
}
output %>% write_csv('RepNestedCV_results_NMFregression.csv')
5-Fold cross validation with 10 repeats within the pseudobulk to estimate the best parameters and get a gestalt of the overall accuracy After choosing the best combination of parameters we will test on the bulk RNA-seq dataset.
output <- read_csv('RepNestedCV_results_NMFregression.csv')
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
model_id = col_character(),
lambda = col_character(),
model_size = col_double(),
pearson = col_double(),
spearman = col_double(),
features = col_character(),
iteration = col_double(),
foldname = col_character(),
NMF = col_character()
)
output %>% pull(features) %>% table()
.
AnyCorr_LinDE_BDevDE_FDR01 AnyCorr_LinDE_BDevDE_FDR05 AnyCorr_LinDE_FDR01 AnyCorr_LinDE_FDR05
1000 1000 1000 1000
PosCorr_LinDE_BDevDE_FDR01 PosCorr_LinDE_BDevDE_FDR05 PosCorr_LinDE_FDR01 PosCorr_LinDE_FDR05
1000 1000 1000 1000
output %>%
mutate(NMF = factor(NMF, levels = c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10'))) %>%
ggplot(aes(x = reorder(features, -model_size), y = model_size, fill = lambda)) +
geom_hline(yintercept = 10, lty = 2) + geom_hline(yintercept = 20, lty = 2) +
geom_hline(yintercept = 30, lty = 2) + geom_hline(yintercept = 40, lty = 2) +
geom_boxplot(outlier.size = 0.8) + ggbeeswarm::geom_quasirandom(dodge.width = 0.7, size = 0.2, alpha = 0.7) +
facet_wrap(.~NMF, ncol = 6) + theme_pubr() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
stat_compare_means(label = 'p.signif')
NA
output %>%
mutate(NMF = factor(NMF, levels = c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10'))) %>%
ggplot(aes(x = reorder(features, -pearson), y = pearson, fill = lambda)) +
geom_boxplot(outlier.size = 0.8) + ggbeeswarm::geom_quasirandom(dodge.width = 0.7, size = 0.2, alpha = 0.7) +
facet_wrap(.~NMF, ncol = 6) + theme_pubr() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
stat_compare_means(label = 'p.signif')
NA
In summary, using combination of positively and negatively correlated genes leads to better performance, particularly for: NMF1 (Pro-B) NMF5 (Erythroid) NMF11 (Naive T)
In terms of filtering of correlated genes; Lin DE BDev DE FDR < 0.01 as a filter consistently lead to the best performance.
Lambda choice of Minimum + 1SE resulted in model size reductions of nearly 20 genes without sacrificing performance by nested CV.
output_CVmedians <- output %>% filter(lambda == 'lambda.1se', features == 'AnyCorr_LinDE_BDevDE_FDR01') %>% #AnyCorr_LinDE_FDR01. AnyCorr_LinDE_BDevDE_FDR01
select(NMF, model_size, pearson, spearman) %>%
group_by(NMF) %>% summarise_all(median) %>% arrange(pearson)
output_CVmedians
Train from both positively corr and any corr. If models from both approaches have negative coefficients, use any corr. Also make sure that the genes are present in the bulk RNAseq data
train_LASSO <- function(x_train, y_train, alpha = 1){
train_y <- y_train$NMFscore
# Perform Lasso regression with LOOCV
model <- cv.glmnet(x = x_train, y = train_y, nfold = dim(x_train)[1], family = 'gaussian', alpha = alpha, maxit=1000000, standardize=FALSE)
#plot(model)
return(model)
}
bulkRNAgenes <- data.table::fread("../BALLbulk_Deconvolution/BALL_bulkRNA_data/BALL_BulkRNAseq_subsetgene_rawcounts.txt")$Gene
bulkRNAgenes %>% length()
[1] 36110
set.seed(123)
model_list <- list()
# subset NMF corr with genes present in the bulk RNA data
NMFcorr <- NMFcorr %>% filter(Gene %in% bulkRNAgenes)
for(NMFcomp in c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10')){
# Define y variable; NMF score
train_y <- NMF_ptscores %>% filter(NMF == NMFcomp) %>% select(Patient, NMFscore)
# Define feature space to train from
feature_space <- NMFcorr %>% filter(NMF == NMFcomp, LinDE_FDR01 == TRUE, BDevDE_FDR01 == TRUE) %>% pull(Gene) #BDevDE_FDR01 == TRUE
# subset training set
train_x <- pseudobulk_vst@assays$RNA@data[feature_space, train_y$Patient] %>% data.matrix() %>% t()
model <- train_LASSO(train_x, y_train = train_y)
model_list[[NMFcomp]] <- model
}
Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per foldWarning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
# model weights
modelweights <- data.frame()
for(NMFcomp in c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10')){
modelweights <- modelweights %>% bind_rows(
model_list[[NMFcomp]] %>% coef(s = 'lambda.1se') %>% data.matrix() %>%
data.frame() %>% dplyr::rename(Weight = s1) %>% rownames_to_column('Gene') %>%
tail(-1) %>% filter(Weight != 0) %>% arrange(-Weight) %>% mutate(Model = NMFcomp)
)
}
modelweights <- modelweights %>% select(Model, Gene, Weight)
modelweights %>% group_by(Model) %>% summarise(count = n())
# model weights
modelweights <- data.frame()
for(NMFcomp in c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10')){
modelweights <- modelweights %>% bind_rows(
model_list[[NMFcomp]] %>% coef(s = 'lambda.1se') %>% data.matrix() %>%
data.frame() %>% dplyr::rename(Weight = s1) %>% rownames_to_column('Gene') %>%
tail(-1) %>% filter(Weight != 0) %>% arrange(-Weight) %>% mutate(Model = NMFcomp)
)
}
modelweights <- modelweights %>% select(Model, Gene, Weight)
modelweights %>% group_by(Model) %>% summarise(count = n())
modelweights %>%
left_join(NMFconvert %>% dplyr::rename(Model = NMF)) %>%
mutate(coefficient = ifelse(Weight > 0, 'Positive', 'Negative') %>% factor(levels = c('Positive', 'Negative'))) %>%
group_by(NMFnamed, coefficient) %>% summarise(count = n()) %>%
ggplot(aes(x = NMFnamed, y = count, fill = coefficient)) + geom_col() + ggpubr::theme_pubr() +
ggsci::scale_fill_simpsons() + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Joining with `by = join_by(Model)``summarise()` has grouped output by 'NMFnamed'. You can override using the `.groups` argument.
NMFnamed_levels <- c('HSC_MPP', 'Myeloid_Prog', 'Pre_pDC', 'Early_Lymphoid', 'Pro_B', 'Pre_B',
'Mature_B', 'Erythroid', 'Monocyte', 'T_NK')
NMFconvert <- data.frame(
'NMF' = c('NMF8', 'NMF6', 'NMF7', 'NMF2', 'NMF1', 'NMF4',
'NMF5', 'NMF3', 'NMF9', 'NMF10') %>% factor(),
'NMFnamed' = NMFnamed_levels %>% factor(levels = NMFnamed_levels)
)
NMFconvert
modelweights <- modelweights %>% left_join(NMFconvert %>% dplyr::rename(Model = NMF)) %>%
arrange(Gene) %>% arrange(NMFnamed) %>% pivot_wider(id_cols=Gene, names_from=NMFnamed, values_from=Weight) %>% replace(is.na(.), 0)
Joining with `by = join_by(Model)`
modelweights %>% write_csv("NMF_Lasso_ModelWeights.csv")
modelweights
calculate_NMFscores = function(query, modelweights, scale = TRUE, sampleID = 'Patient'){
# Check for overlap with model genes and query genes
querygenes <- rownames(query)
modelweights_missing <- sum(!(modelweights$Gene %in% querygenes))
# check for missing genes
if(modelweights_missing > 0){
print(paste0('Warning: ', modelweights_missing, ' genes from NMF models are missing from query dataset'))
}
# filter model weights
modelweights <- modelweights %>% filter(Gene %in% querygenes)
modelweights_mat <- modelweights %>% column_to_rownames('Gene') %>% data.matrix()
# multiply query by NMF lasso weights
scored <- (t(query[modelweights$Gene,]) %*% modelweights_mat) %>% data.matrix()
if(scale == TRUE){
scored <- scale(scored)
}
scored <- scored %>% as.data.frame() %>% rownames_to_column(sampleID)
return(scored)
}
modelweights <- read_csv("NMF_Lasso_ModelWeights.csv")
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Gene = col_character(),
HSC_MPP = col_double(),
Myeloid_Prog = col_double(),
Pre_pDC = col_double(),
Early_Lymphoid = col_double(),
Pro_B = col_double(),
Pre_B = col_double(),
Mature_B = col_double(),
Erythroid = col_double(),
Monocyte = col_double(),
T_NK = col_double()
)
modelweights
pseudobulk_vst <- readRDS("../BALLbulk_Deconvolution/BALL_89pt_pseudobulk_vst.rds")
# calculate NMF scores from vst-normalized data
pseudobulk_vst_NMFscores <- calculate_NMFscores(pseudobulk_vst@assays$RNA@data, modelweights, scale = T, sampleID = 'Patient')
pseudobulk_vst_NMFscores[1:10,1:10]
Training results - pseudobulk
NMF_compare <- NMF_ptscores %>%
left_join(NMFconvert) %>%
left_join(pseudobulk_vst_NMFscores %>% pivot_longer(-Patient, names_to = 'NMFnamed', values_to = 'predNMF'))
Joining with `by = join_by(NMF)`Joining with `by = join_by(Patient, NMFnamed)`
NMF_compare %>%
mutate(NMFnamed = factor(NMFnamed, levels = NMFnamed_levels)) %>%
ggplot(aes(x = predNMF, y = NMFscore)) +
geom_point() + geom_smooth(method = 'lm') +
facet_wrap(.~NMFnamed, scales = 'free') +
theme_pubr() + stat_cor()
bulkRNA_counts <- data.table::fread("../BALLbulk_Deconvolution/BALL_bulkRNA_data/BALL_BulkRNAseq_subsetgene_rawcounts.txt") %>%
column_to_rownames('Gene') %>% data.matrix()
bulkRNA_counts[1:10,1:10]
SJBALL020608_D1 SJBALL082_D SJINF074_D SJBALL209_D SJINF049_D SJALL050848_D1 SJBALL016239_D1 SJBALL205_D SJBALL016303_D1 SJBALL016244_D1
A1BG 143 317 182 122 82 252 1090 298 78 51
A1BG-AS1 147 771 279 326 199 307 921 552 119 111
A1CF 1 2 1 0 0 0 0 0 0 1
A2M 11 8 20 75 15 4 4 8 1 88
A2M-AS1 6 198 5 489 26 43 15 38 23 36
A2ML1 0 0 2 0 1 2 0 0 0 0
A2ML1-AS1 0 1 0 0 0 0 0 0 0 0
A2ML1-AS2 0 0 0 0 0 0 1 0 0 0
A3GALT2 2 1 2 0 2 2 0 0 5 0
A4GALT 59 32 60 60 34 4 37 18 2 44
library(DESeq2)
bulkRNA_dds <- DESeqDataSetFromMatrix(bulkRNA_counts[rowSums(bulkRNA_counts) >= 10,],
colData = data.frame('Patient' = colnames(bulkRNA_counts)) %>% column_to_rownames('Patient'),
design = ~1)
bulkRNA_vst <- assay(vst(bulkRNA_dds))
rm(bulkRNA_counts, bulkRNA_dds)
bulkRNA_vst[1:10,1:10]
SJBALL020608_D1 SJBALL082_D SJINF074_D SJBALL209_D SJINF049_D SJALL050848_D1 SJBALL016239_D1 SJBALL205_D
A1BG 7.140496 7.459732 7.069288 6.857087 6.789549 8.334186 9.654538 8.049401
A1BG-AS1 7.171814 8.579933 7.566172 8.013636 7.813683 8.592772 9.422367 8.855233
A1CF 4.393532 4.420817 4.350597 4.088563 4.088563 4.088563 4.088563 4.088563
A2M 5.082162 4.748766 5.231875 6.364534 5.376806 4.830286 4.673378 4.948642
A2M-AS1 4.828826 6.923254 4.671326 8.537353 5.748445 6.318600 5.201054 5.874671
A2ML1 4.088563 4.088563 4.458630 4.088563 4.431539 4.615895 4.088563 4.088563
A2ML1-AS1 4.088563 4.323761 4.088563 4.088563 4.088563 4.088563 4.088563 4.088563
A2ML1-AS2 4.088563 4.088563 4.088563 4.088563 4.088563 4.088563 4.382468 4.088563
A3GALT2 4.519059 4.323761 4.458630 4.088563 4.572475 4.615895 4.088563 4.088563
A4GALT 6.232613 5.377247 5.982178 6.159561 5.959306 4.830286 5.780731 5.356654
SJBALL016303_D1 SJBALL016244_D1
A1BG 7.804691 7.006957
A1BG-AS1 8.341231 7.928454
A1CF 4.088563 4.568334
A2M 4.632465 7.641534
A2M-AS1 6.442545 6.636279
A2ML1 4.088563 4.088563
A2ML1-AS1 4.088563 4.088563
A2ML1-AS2 4.088563 4.088563
A3GALT2 5.278005 4.088563
A4GALT 4.853334 6.846260
# calculate NMF scores from vst-normalized data
bulkRNA_vst_NMFscores <- calculate_NMFscores(bulkRNA_vst, modelweights, scale = T, sampleID = 'Patient')
bulkRNA_vst_NMFscores[1:10,1:10]
Validation results - matched bulkRNAseq
NMF_ptscores_converted <- NMF_ptscores %>%
left_join(read_delim("../BALL_metadata_20230105.txt", delim = '\t') %>% select(Patient = Directory, Sample, ID, TB))
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
Sample = col_character(),
ID = col_character(),
Directory = col_character(),
TB = col_character(),
age = col_double(),
sex = col_character(),
diagnosis = col_character(),
subdiagnosis = col_character(),
location = col_character()
)
Joining with `by = join_by(Patient)`
NMF_ptscores_compareBulkRNA <- NMF_ptscores_converted %>%
left_join(NMFconvert) %>%
left_join(bulkRNA_vst_NMFscores %>% pivot_longer(-Patient, names_to = 'NMFnamed', values_to = 'predNMF') %>%
dplyr::rename(ID = Patient))
Joining with `by = join_by(NMF)`Joining with `by = join_by(ID, NMFnamed)`
NMF_ptscores_compareBulkRNA
NMF_ptscores_compareBulkRNA %>%
mutate(NMFnamed = factor(NMFnamed, NMFnamed_levels)) %>%
ggplot(aes(x = predNMF, y = NMFscore)) +
geom_point() + geom_smooth(method = 'lm') +
facet_wrap(.~NMFnamed, scales = 'free', ncol = 5) +
theme_pubr() + stat_cor() +
ylab('NMF Lineage Score (scRNA composition)') + xlab('Predicted NMF Score (matched bulk RNA-seq)')
NMF_ptscores_compareBulkRNA %>%
mutate(NMFnamed = factor(NMFnamed, NMFnamed_levels)) %>%
ggplot(aes(x = predNMF, y = NMFscore)) +
geom_point() + geom_smooth(method = 'lm') +
facet_wrap(.~NMFnamed, scales = 'free', ncol = 2) +
theme_pubr() + stat_cor() +
ylab('NMF Lineage Score (scRNA composition)') + xlab('Predicted NMF Score (matched bulk RNA-seq)')
Not bad!! Save bulk RNAseq NMF scores:
bulkRNA_vst_NMFscores %>% write_csv('Bulk2046_NMFregression_LineageScores.csv')
bulkRNA_vst_NMFscores
output %>%
left_join(NMFconvert) %>%
#mutate(NMF = factor(NMF, levels = c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10'))) %>%
filter(features == 'AnyCorr_LinDE_BDevDE_FDR01', lambda == 'lambda.1se') %>%
ggplot(aes(x = NMFnamed, y = pearson, fill = NMFnamed)) +
theme_pubr(legend = 'none') + geom_hline(yintercept = 0.5, lty = 2) + geom_hline(yintercept = 0.75, lty = 2) + geom_hline(yintercept = 0.9, lty = 2) +
geom_boxplot(outlier.size = 0) + ggbeeswarm::geom_quasirandom(size = 1, alpha = 0.7) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
Joining with `by = join_by(NMF)`
NA
output %>%
left_join(NMFconvert) %>%
#mutate(NMF = factor(NMF, levels = c('NMF1', 'NMF2', 'NMF3', 'NMF4', 'NMF5', 'NMF6', 'NMF7', 'NMF8', 'NMF9', 'NMF10'))) %>%
filter(features == 'AnyCorr_LinDE_BDevDE_FDR01', lambda == 'lambda.1se') %>%
ggplot(aes(x = reorder(NMFnamed, -pearson), y = pearson, fill = NMFnamed)) +
theme_pubr(legend = 'none') + geom_hline(yintercept = 0.5, lty = 2) + geom_hline(yintercept = 0.75, lty = 2) + geom_hline(yintercept = 0.9, lty = 2) +
geom_boxplot(outlier.size = 0) + ggbeeswarm::geom_quasirandom(size = 1, alpha = 0.7) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
NMF_ptscores_compareBulkRNA %>% drop_na()
cb_fractions = data.table::fread('../../../../../../../Dormancy/HSC_analysis/HSC_byOntogeny/Dicklab_sortedFractions_Batch4_CB_Hierarchy_vst.csv')
cb_fractions <- cb_fractions %>% column_to_rownames('Gene') %>% data.matrix()
cb_fractions %>% dim()
[1] 43165 107
CBfractions_vst_NMFscores <- calculate_NMFscores(cb_fractions, modelweights, scale = T, sampleID = 'Sample')
[1] "Warning: 1 genes from NMF models are missing from query dataset"
CBfractions_vst_NMFscores
CBfractions_vst_NMFscores %>%
pivot_longer(-Sample, names_to = 'NMFsig', values_to = 'Score') %>%
mutate(NMFsig = factor(NMFsig, levels = NMFnamed_levels),
Population = Sample %>% str_replace('.*CB_',''),
Population = factor(Population, levels = c('HSC', 'MPP', 'LMPP', 'CMP', 'GMP', 'MLPII', 'EarlyProB', 'PreProB',
'ProB', 'PreB', 'B', #'B1', 'B2', 'B3', 'B4', 'B5', 'B6',
'T', 'NK', 'EryP', 'Mono', 'Gr'))) %>%
filter(Population != 'NA') %>%
ggplot(aes(x = Population, y = Score, fill = Population)) +
geom_boxplot() + geom_jitter() +
theme_pubr(legend = 'none') + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
facet_wrap(.~NMFsig, scales = 'free', ncol = 3)
NA
CBfractions_vst_NMFscores %>%
pivot_longer(-Sample, names_to = 'NMFsig', values_to = 'Score') %>%
mutate(NMFsig = factor(NMFsig, levels = NMFnamed_levels),
Population = Sample %>% str_replace('.*CB_',''),
Population = factor(Population, levels = c('HSC', 'MPP', 'LMPP', 'CMP', 'GMP', 'MLPII', 'EarlyProB', 'PreProB',
'ProB', 'PreB', 'B', #'B1', 'B2', 'B3', 'B4', 'B5', 'B6',
'T', 'NK', 'EryP', 'Mono', 'Gr'))) %>%
# require gene symbol column to be named "Gene"
rpkm_to_logTPM <- function(dat){
# convert to TPM
dat_TPM <- dat %>%
gather(-Gene, key = "Sample", value = "RPKM") %>%
group_by(Sample) %>%
mutate(logTPM = log1p(RPKM / sum(RPKM) * 1000000)) %>%
select(-RPKM) %>% ungroup() %>%
spread(Sample, logTPM)
return(dat_TPM)
}
pharmacotype_fpkm <- data.table::fread('pharmacotypes/pharmacotyping_ped_rnaseq_fpkm_ALLids_0823.csv') %>% select(-GeneID) %>% dplyr::rename(Gene = GeneName)
pharmacotype_fpkm